import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
# Use the 'notebook' renderer as the default for Plotly figure display.
pio.renderers.default = 'notebook'
# Load the students-performance CSV; the path is relative to the working directory.
data_path = './Data/StudentsPerformance.csv'
df = pd.read_csv(data_path)
# Preview the first rows of the loaded frame.
df.head()
# Inspect the distinct values of each categorical column, one column at a time.
df['gender'].unique()
df['race/ethnicity'].unique()
df['parental level of education'].unique()
df['lunch'].unique()
df['test preparation course'].unique()
The Students Performance in Exams dataset has the following columns:
• gender: sex of students - ['female', 'male']
• race/ethnicity: race or ethnicity of the students, categorized into the following groups - ['group B', 'group C', 'group A', 'group D', 'group E']
• parental level of education: educational level of the parents, one of the following - ["bachelor's degree", 'some college', "master's degree", "associate's degree", 'high school', 'some high school']
• lunch: type of lunch the student receives - ['standard', 'free/reduced']
• test preparation course: whether or not the students have completed a test preparation course before exams - ['none', 'completed']
• math score: numerical scores representing the performance of the students in a math exam.
• reading score: numerical scores representing the performance of the students in a reading exam.
• writing score: numerical scores representing the performance of the students in a writing exam.
# Data-quality checks: missing values per column and fully duplicated rows.
missing_per_column = df.isna().sum()
print('Null values count in each column\n\n', missing_per_column)
duplicate_row_count = df.duplicated().sum()
print('Duplicate rows count:', duplicate_row_count)
# Column dtypes and non-null counts, followed by summary statistics.
df.info()
df.describe()
For further exploration, a new column will be added to describe the average score per student based on the math, reading, and writing scores.
# Per-student mean of the three exam scores, stored as a new 'average score' column.
score_columns = ['math score', 'reading score', 'writing score']
# skipna=False mirrors plain addition: a missing score propagates as NaN.
score_totals = df[score_columns].sum(axis=1, skipna=False)
df['average score'] = score_totals / len(score_columns)
df.head()
Let's delve deeper into the data and examine the impact of various columns on students' performance in exams.
# Distribution of the average score for all students and for each gender.
is_female = df['gender'] == 'female'
is_male = df['gender'] == 'male'
female = df.loc[is_female, 'average score']
male = df.loc[is_male, 'average score']

hist_data = [df['average score'], female, male]
group_labels = ['both', 'female', 'male']
group_colors = ['slategray', 'magenta', 'rgb(0, 0, 100)']

fig = ff.create_distplot(hist_data, group_labels, colors=group_colors, bin_size=3)
fig.show()
# 2x2 grid of seaborn bar plots showing the average score per category.
# Each plot is drawn on the Axes returned by plt.subplots() instead of
# re-selecting a panel with plt.subplot(22x), so `ax` refers to the axes
# that actually hold the plots.
fig, ax = plt.subplots(2, 2, figsize=(20, 15))
title_style = dict(fontsize=15, fontweight='bold', y=1.03)

# Top-left: parental education on the y-axis (horizontal bars), explicit order.
edu_ax = ax[0, 0]
edu_ax.set_title(label='Edu. Level & Average Score', **title_style)
sns.barplot(
    data=df,
    y='parental level of education',
    x='average score',
    palette='Greens_r',
    order=[
        "master's degree",
        "bachelor's degree",
        "associate's degree",
        'some college',
        'some high school',
        'high school',
    ],
    ax=edu_ax,
)

# Top-right: lunch type.
lunch_ax = ax[0, 1]
lunch_ax.set_title(label='Lunch & Average Score', **title_style)
sns.barplot(data=df, x='lunch', y='average score', palette='rocket', ax=lunch_ax)
lunch_ax.set_xlabel('')

# Bottom-left: test preparation course.
prep_ax = ax[1, 0]
prep_ax.set_title(label='Test prep course & Average Score', **title_style)
sns.barplot(
    data=df,
    x='test preparation course',
    y='average score',
    palette='rocket',
    order=['completed', 'none'],
    ax=prep_ax,
)
prep_ax.set_xlabel('')

# Bottom-right: race/ethnicity (title typo "ethinicity" corrected).
race_ax = ax[1, 1]
race_ax.set_title(label='Race/ethnicity & Average Score', **title_style)
sns.barplot(
    data=df,
    x='race/ethnicity',
    y='average score',
    palette='viridis',
    order=['group E', 'group D', 'group C', 'group B', 'group A'],
    ax=race_ax,
)
race_ax.set_xlabel('')

fig.tight_layout(pad=2)
plt.show()
# Mean of every numeric score column for each gender.
# numeric_only=True is needed on pandas >= 2.0, where averaging the remaining
# string columns raises TypeError instead of silently dropping them.
grouped_gender = df.groupby('gender').mean(numeric_only=True)
grouped_gender
# Shorten column names for the x-axis, e.g. 'math score' -> 'math'.
x_labels = [col.split()[0] for col in grouped_gender.columns]
fig = go.Figure()
# One bar trace per gender, sharing the same x labels so the bars are grouped.
for gender_key, trace_name, bar_color in (
    ('female', 'Female', 'Magenta'),
    ('male', 'Male', 'Navy'),
):
    fig.add_trace(go.Bar(
        x=x_labels,
        y=grouped_gender.loc[gender_key],
        name=trace_name,
        marker_color=bar_color,
    ))
# Centre the title horizontally near the top of the figure.
fig.update_layout(title={
    'text': "Scores of both genders compared",
    'y': 0.9,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top'}
)
fig.show()
# One bar chart per subject, showing the mean score of each race/ethnicity group.
fig = make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=("Math score", "Reading score", "Writing score"))
colors = ['#003f5c','#58508d','#bc5090','#ff6361','#ffa600']
# numeric_only=True is needed on pandas >= 2.0, where averaging the remaining
# string columns raises TypeError.
enthnicity_groups = df.groupby('race/ethnicity').mean(numeric_only=True)
# The x labels must come from the grouped index: groupby sorts its keys, while
# df['race/ethnicity'].unique() returns them in order of first appearance, so
# pairing unique() with the grouped means attached bars to the wrong groups.
group_labels = enthnicity_groups.index
for col_idx, score_col in enumerate(('math score', 'reading score', 'writing score'), start=1):
    fig.add_trace(
        go.Bar(x=group_labels, y=enthnicity_groups[score_col], marker=dict(color=colors)),
        row=1, col=col_idx,
    )
    fig.update_xaxes(title_text="race/ethnicity", row=1, col=col_idx)
fig.update_layout(height=500, width=1000,coloraxis=dict(colorscale='Bluered_r'), showlegend=False)
fig.show()
Insights
def _category_pie(column, pull, **pie_kwargs):
    """Build a pie trace of the value counts of ``df[column]``.

    ``pull`` is forwarded to ``go.Pie`` (one offset per slice); any extra
    keyword arguments are passed through unchanged. The counts are computed
    once and reused for both the slice values and their labels.
    """
    counts = df[column].value_counts()
    return go.Pie(values=counts.values, labels=counts.index, pull=pull, **pie_kwargs)


# 2x3 grid of pie charts, one per categorical column; the sixth cell stays empty.
# 'domain' subplot types are required for pie traces.
fig = make_subplots(
    specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}],
           [{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]],
    # Title typo "Race/enthnicity" corrected.
    subplot_titles=['Gender', 'Race/ethnicity', 'Lunch', 'Test course', 'Parental education'],
    rows=2, cols=3)
fig.add_trace(_category_pie('gender', [0.05, 0]), row=1, col=1)
# `colors` is the five-colour palette defined earlier in the notebook.
fig.add_trace(_category_pie('race/ethnicity', [0.05] * 5, marker=dict(colors=colors)),
              row=1, col=2)
fig.add_trace(_category_pie('lunch', [0.05, 0]), row=1, col=3)
fig.add_trace(_category_pie('test preparation course', [0.05, 0]), row=2, col=1)
fig.add_trace(_category_pie('parental level of education', [0.05] * 6), row=2, col=2)
fig.update_layout(height=600, width=800, showlegend=False)
fig.show()